#!/bin/bash

# Every setting below may be overridden from the environment; a variable that
# is unset or empty falls back to the default given here.

# Path of the nutch launcher
: "${NUTCH:=./bin/nutch}"

# Java package holding the webgraph tools, and the individual tool classes
# (each class defaults to a name inside that package)
: "${WEBGRAPH_PACKAGE:=org.apache.nutch.scoring.webgraph}"
: "${WEBGRAPH_CLASS:=${WEBGRAPH_PACKAGE}.WebGraph}"
: "${LOOPS_CLASS:=${WEBGRAPH_PACKAGE}.Loops}"
: "${LINK_RANK_CLASS:=${WEBGRAPH_PACKAGE}.LinkRank}"
: "${SCORE_UPDATER_CLASS:=${WEBGRAPH_PACKAGE}.ScoreUpdater}"
: "${NODE_DUMPER_CLASS:=${WEBGRAPH_PACKAGE}.NodeDumper}"

# Crawl directory and the databases that live under it by default
: "${CRAWL_DIR:=./crawl}"
: "${CRAWL_DB:=${CRAWL_DIR}/crawldb}"
: "${WEBGRAPH_DB:=${CRAWL_DIR}/webgraphdb}"

# Print the help text on stdout: a synopsis followed by the environment
# variables that configure the script.
# Globals read: NUTCH, CRAWL_DIR, WEBGRAPH_PACKAGE. Their current values are
# interpolated into the text; the remaining defaults are printed symbolically
# (the escaped \$ keeps them from being expanded by the here-document).
usage()
{
    # Parameter expansion instead of $(basename ...): no extra process, and no
    # exit status hidden behind `local'.
    local util=${0##*/}
    cat <<EOF
Usage: $util
   \`$util' runs webgraph on crawled segments and updates the index with a new \
node score.
   Here are some environment variables to configure the script:
NUTCH               - nutch binary (default: $NUTCH)

CRAWL_DIR           - crawl directory (default: $CRAWL_DIR)
CRAWL_DB            - crawl db directory (default: \$CRAWL_DIR/crawldb)
WEBGRAPH_DB         - webgraph db directory (default: \$CRAWL_DIR/webgraphdb)

WEBGRAPH_PACKAGE    - prefix of webgraph classes (default: $WEBGRAPH_PACKAGE)
WEBGRAPH_CLASS      - WebGraph Class (default: \$WEBGRAPH_PACKAGE.WebGraph)
LOOPS_CLASS         - Loops Class (default: \$WEBGRAPH_PACKAGE.Loops)
LINK_RANK_CLASS     - LinkRank Class (default: \$WEBGRAPH_PACKAGE.LinkRank)
SCORE_UPDATER_CLASS - ScoreUpdater Class (default: \$WEBGRAPH_PACKAGE.ScoreUpdater)
NODE_DUMPER_CLASS   - NodeDumper Class (default: \$WEBGRAPH_PACKAGE.NodeDumper)
EOF
}

# The script takes no positional arguments; it is configured through
# environment variables only. Any argument therefore prints the help text:
# -h/--help is a successful help request (stdout, exit 0), anything else is a
# usage error (stderr, exit 1).
# A `case' is used instead of `[ ... -o ... ]', whose -o operator is
# obsolescent and can be misparsed when "$1" looks like a test operator.
if [ $# -gt 0 ]; then
    case "$1" in
        -h|--help)
            usage
            exit 0
            ;;
        *)
            usage >&2
            exit 1
            ;;
    esac
fi

# Stop at the first failing step: every stage below works on the webgraph db
# produced or updated by the stages before it.
set -e

# Build the "-segment <dir>" argument list as an array so that segment paths
# containing whitespace or glob characters reach nutch as single arguments.
# Only entries whose name starts with "20" are picked up (presumably
# timestamp-named segments from the years 20xx -- confirm against the crawl
# layout before widening the pattern).
segment_args=()
for segment in "$CRAWL_DIR"/segments/20*; do
    # An unmatched glob is left as the literal pattern; do not pass it on.
    [ -e "$segment" ] || continue
    segment_args+=(-segment "$segment")
done

if [ "${#segment_args[@]}" -eq 0 ]; then
    echo "No segments found matching $CRAWL_DIR/segments/20*" >&2
    exit 1
fi

echo "Running WebGraph"
"$NUTCH" "$WEBGRAPH_CLASS" "${segment_args[@]}" -webgraphdb "$WEBGRAPH_DB"

echo "Running Loops"
"$NUTCH" "$LOOPS_CLASS" -webgraphdb "$WEBGRAPH_DB"

echo "Running LinkRank"
"$NUTCH" "$LINK_RANK_CLASS" -webgraphdb "$WEBGRAPH_DB"

echo "Running ScoreUpdater"
"$NUTCH" "$SCORE_UPDATER_CLASS" -crawldb "$CRAWL_DB" \
    -webgraphdb "$WEBGRAPH_DB"

# Dump the 1000 top-scoring nodes under the webgraph db directory.
echo "Running NodeDumper"
"$NUTCH" "$NODE_DUMPER_CLASS" -scores -topn 1000 -webgraphdb "$WEBGRAPH_DB" \
    -output "${WEBGRAPH_DB}/dump/scores"
